1 Define Parameters

# Paths
path_to_metadata <- "../01-cellranger_mapping/data/tonsil_atlas_metadata.csv"
path_to_save_obj <- "../results"
rds_obj <- str_c(
  path_to_save_obj,
  "seurat_object_cite_seq_postQC.rds",
  sep = "/"
)
path_to_scrublet <- "../02-scrublet_doublet_detection/results"
path_to_vireo_genotype <- "../03-demultiplexing_genotype/2-vireo"
path_to_cellranger_output <- "../01-cellranger_mapping/projects"

2 Define threshold

min_count_33 <- 600
max_count_33 <- 12000
min_feat_33 <- 300
max_feat_33 <- 3000
max_mt_33 <- 15
min_count_38 <- 300
max_count_38 <- 6000
min_feat_38 <- 200
max_feat_38 <- 2000
max_mt_38 <- 15
min_count_40 <- 400
max_count_40 <- 6000
min_feat_40 <- 250
max_feat_40 <- 2250
max_mt_40 <- 10
min_count_46 <- 350
max_count_46 <- 6000
min_feat_46 <- 300
max_feat_46 <- 2500
max_mt_46 <- 15
min_counts <- c(min_count_33, min_count_38, min_count_40, min_count_46)
max_counts <- c(max_count_33, max_count_38, max_count_40, max_count_46)
min_feats <- c(min_feat_33, min_feat_38, min_feat_40, min_feat_46)
max_feats <- c(max_feat_33, max_feat_38, max_feat_40, max_feat_46)
max_mt <- c(max_mt_33, max_mt_38, max_mt_40, max_mt_46)
max_genes <- 10

3 Set up the Seurat Object

GEX and CITEseq (ADT) dataset is integrated to set up the seurat object for the project

## Rows: 60 Columns: 6
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): subproject, gem_id, library_name, type, donor_id
## dbl (1): library_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 
## BCLLATLAS_33_ifZOgenn_TpMNTvBa BCLLATLAS_33_mLuLpVxi_v0fLyotc 
##                           2947                           2649 
## BCLLATLAS_38_B20O1bh7_VmM99YZJ BCLLATLAS_38_rdFRFhrU_ZdYeOZlf 
##                           2269                           2565 
## BCLLATLAS_38_WToIzInl_LudU7hVX BCLLATLAS_40_CfdzDgHe_IMOTbrIP 
##                           1855                           2799 
## BCLLATLAS_40_LxBTpkPO_8TcNpBg4 BCLLATLAS_40_pseMjZsU_qgZNOhOQ 
##                           3016                           3022 
## BCLLATLAS_40_ujxNn2kq_lG2VLlYd BCLLATLAS_40_uqJAc4r9_BYScOzxA 
##                           3098                           3125 
## BCLLATLAS_46_BZEECBEG_GXkc6Q1y BCLLATLAS_46_HjqdPU0E_aGDmEY5F 
##                           4191                           3166 
## BCLLATLAS_46_KETfaLdx_Ub1mtE13 BCLLATLAS_46_SOJZt9kY_qpnv20QN 
##                           4498                           3574 
## BCLLATLAS_46_XV1SLOR2_HRF5D9A3 
##                           3571

4 Calculate mitochondrial percent and Add number of genes per UMI for each cell to metadata

bcll.combined[["percent.mt"]] <- PercentageFeatureSet(bcll.combined, pattern = "^MT-")

bcll.combined$log10GenesPerUMI <- log10(bcll.combined$nFeature_RNA) / log10(bcll.combined$nCount_RNA)

subproject_seurat_obj_list <- SplitObject(bcll.combined, split.by = "subproject")

QC_summary <- bcll.combined@meta.data %>% group_by(orig.ident) %>% summarise(Total_cells = n(), Mean_RNA_count= mean(nCount_RNA), Mean_feature_count = mean(nFeature_RNA), Mean_mito_percent = mean(percent.mt), Mean_log10GenesPerUMI = mean(log10GenesPerUMI) ,Number_of_scrublet_doublet = sum(scrublet_predicted_doublet == "True"), Number_of_genotype_doublet = sum(genotype_based_doublet_flag == "T") , Number_of_genotype_unassigned = sum(genotype_based_unassigned_flag == "T") ,Number_of_TCR = sum(tcr_flag == "T", na.rm = T) , Number_of_BCR = sum(bcr_flag == "T", na.rm = T) , Number_of_mait_cells = sum(mait_evidence %in% c("TRB:gene","TRA:gene+junction;TRB:gene","TRA:gene","TRA:gene;TRB:gene","TRA:junction;TRB:gene","TRA:junction","TRA:gene+junction"), na.rm = T), Number_of_inkt_cells =sum(inkt_evidence %in% c("TRB:gene","TRA:gene;TRB:gene","TRA:gene"), na.rm = T), .groups = 'drop')

metadata <- bcll.combined@meta.data
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
kable(QC_summary) %>%
  kable_styling("striped", full_width = T)
orig.ident Total_cells Mean_RNA_count Mean_feature_count Mean_mito_percent Mean_log10GenesPerUMI Number_of_scrublet_doublet Number_of_genotype_doublet Number_of_genotype_unassigned Number_of_TCR Number_of_BCR Number_of_mait_cells Number_of_inkt_cells
BCLLATLAS_33_ifZOgenn_TpMNTvBa 2947 2372.930 979.3444 3.265162 0.8934118 4 0 0 330 1166 79 3
BCLLATLAS_33_mLuLpVxi_v0fLyotc 2649 3043.896 1163.6436 3.196555 0.8888973 1 0 0 304 1362 72 0
BCLLATLAS_38_B20O1bh7_VmM99YZJ 2269 1016.181 496.9912 5.202306 0.9012692 3 127 150 206 590 49 1
BCLLATLAS_38_rdFRFhrU_ZdYeOZlf 2565 1306.379 601.7649 4.409707 0.8988821 2 175 96 237 822 56 2
BCLLATLAS_38_WToIzInl_LudU7hVX 1855 1020.876 492.7003 4.742993 0.9001380 1 87 71 316 464 59 2
BCLLATLAS_40_CfdzDgHe_IMOTbrIP 2799 1472.776 662.8910 2.992291 0.8992132 7 0 0 280 900 52 3
BCLLATLAS_40_LxBTpkPO_8TcNpBg4 3016 1452.220 687.0802 2.667517 0.9061708 1 0 0 290 1078 66 1
BCLLATLAS_40_pseMjZsU_qgZNOhOQ 3022 1386.118 645.4596 2.803393 0.9016561 9 0 0 293 1059 66 6
BCLLATLAS_40_ujxNn2kq_lG2VLlYd 3098 1270.212 615.1394 2.843212 0.9071128 1 0 0 288 1014 70 2
BCLLATLAS_40_uqJAc4r9_BYScOzxA 3125 1670.094 754.3158 2.619818 0.9019308 4 0 0 297 1038 61 3
BCLLATLAS_46_BZEECBEG_GXkc6Q1y 4191 1653.451 624.2014 4.290005 0.8797365 230 0 0 1249 989 293 12
BCLLATLAS_46_HjqdPU0E_aGDmEY5F 3166 1390.643 596.7274 3.802479 0.8916281 5 0 0 720 503 158 5
BCLLATLAS_46_KETfaLdx_Ub1mtE13 4498 1678.511 666.0965 3.997394 0.8866078 179 0 0 1626 1476 383 10
BCLLATLAS_46_SOJZt9kY_qpnv20QN 3574 1770.673 712.5193 3.834415 0.8884291 4 0 0 817 761 212 3
BCLLATLAS_46_XV1SLOR2_HRF5D9A3 3571 1915.572 778.2716 3.465346 0.8927950 10 0 0 1005 1072 232 8

5 Check the doublets using Genotype and scrublet

Cross table to check the doublet called by scrublet and by the genotype package

table(metadata$donor_id, metadata$scrublet_predicted_doublet)
##                               
##                                False  True
##   B20O1bh7_VmM99YZJ_donor0       950     0
##   B20O1bh7_VmM99YZJ_donor1       255     1
##   B20O1bh7_VmM99YZJ_donor2       461     0
##   B20O1bh7_VmM99YZJ_donor3       325     0
##   B20O1bh7_VmM99YZJ_doublet      125     2
##   B20O1bh7_VmM99YZJ_unassigned   150     0
##   BCLL-15-T                    18572   428
##   BCLL-8-T                      5591     5
##   BCLL-9-T                     15038    22
##   rdFRFhrU_ZdYeOZlf_donor0       344     0
##   rdFRFhrU_ZdYeOZlf_donor1       326     0
##   rdFRFhrU_ZdYeOZlf_donor2      1068     0
##   rdFRFhrU_ZdYeOZlf_donor3       556     0
##   rdFRFhrU_ZdYeOZlf_doublet      173     2
##   rdFRFhrU_ZdYeOZlf_unassigned    96     0
##   WToIzInl_LudU7hVX_donor0       418     0
##   WToIzInl_LudU7hVX_donor1       319     0
##   WToIzInl_LudU7hVX_donor2       572     0
##   WToIzInl_LudU7hVX_donor3       388     0
##   WToIzInl_LudU7hVX_doublet       86     1
##   WToIzInl_LudU7hVX_unassigned    71     0

6 Cell counts

The cell counts are determined by the number of unique cellular barcodes detected.

# Visualize the number of cell counts per sample
plot_cell_count <- function(obj){
  metadata_subset <- obj@meta.data  
  p1 <- metadata_subset %>% 
    ggplot(aes(x=sample, fill=sample)) + 
    geom_bar() +
    theme_classic() +
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("NCells") +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
}  
plot_list <- lapply(subproject_seurat_obj_list, plot_cell_count)
plot_list[1]  
## $BCLLATLAS_33

plot_list[2]
## $BCLLATLAS_38

plot_list[3]
## $BCLLATLAS_40

plot_list[4]
## $BCLLATLAS_46

7 UMI counts (transcripts) per cell

# Visualize the number Feature per cell

feature_per_cell_plot <- function(x){
  obj <- subproject_seurat_obj_list[[x]]
    metadata_subset <- obj@meta.data    
    p2 <- metadata_subset %>% 
        ggplot(aes(color=sample, x=nFeature_RNA, fill= sample)) + 
        geom_density(alpha = 0.2) + 
        scale_x_log10() + 
        theme_classic() +
        ylab("Cell density") +
        geom_vline(xintercept = c(min_feats[[x]], max_feats[[x]]))
}

vec=c(1,2,3,4)

plot_list <- lapply(vec,feature_per_cell_plot)
plot_list[1]  
## [[1]]

plot_list[2]
## [[1]]

plot_list[3]
## [[1]]

plot_list[4]
## [[1]]

8 Gene detected per cell

# Visualize the number Gene per cell

gene_per_cell_plot <- function(x){
    obj <- subproject_seurat_obj_list[[x]]
  metadata_subset <- obj@meta.data  
    p3 <- metadata_subset %>% 
        ggplot(aes(color=sample, x=nCount_RNA, fill= sample)) + 
        geom_density(alpha = 0.2) + 
        scale_x_log10() + 
        theme_classic() +
        ylab("Cell density") +
        geom_vline(xintercept = c(min_counts[[x]], max_counts[[x]]))
}
plot_list <- lapply(vec, gene_per_cell_plot)
plot_list[1]  
## [[1]]

plot_list[2]
## [[1]]

plot_list[3]
## [[1]]

plot_list[4]
## [[1]]

9 Gene distribution

# Visualize the distribution of genes detected per cell via boxplot

gene_per_cell_boxplot <- function(obj){
    metadata_subset <- obj@meta.data    
    p4 <- metadata_subset %>% 
        ggplot(aes(x=sample, y=log10(nCount_RNA), fill=sample)) + 
        geom_boxplot() + 
        theme_classic() +
        theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
        theme(plot.title = element_text(hjust=0.5, face="bold")) +
        ggtitle("NCells vs NGenes")
}
plot_list <- lapply(subproject_seurat_obj_list, gene_per_cell_boxplot)
plot_list[1]  
## $BCLLATLAS_33

plot_list[2]
## $BCLLATLAS_38

plot_list[3]
## $BCLLATLAS_40

plot_list[4]
## $BCLLATLAS_46

10 Correlation Between genes, features and mito_percent

# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs

correlation_plot <- function(obj){
    metadata_subset <- obj@meta.data    
    p5 <- metadata_subset %>% 
        ggplot(aes(x=nFeature_RNA, y=nCount_RNA, color=percent.mt)) + 
        geom_point() + 
        scale_colour_gradient(low = "gray90", high = "black") +
        stat_smooth(method=lm) +
        scale_x_log10() + 
        scale_y_log10() + 
        theme_classic() +
        geom_vline(xintercept = 500) +
        geom_hline(yintercept = 250) +
        facet_wrap(~sample)
}
plot_list <- lapply(subproject_seurat_obj_list, correlation_plot)
plot_list[1]  
## $BCLLATLAS_33
## `geom_smooth()` using formula 'y ~ x'

plot_list[2]
## $BCLLATLAS_38
## `geom_smooth()` using formula 'y ~ x'

plot_list[3]
## $BCLLATLAS_40
## `geom_smooth()` using formula 'y ~ x'

plot_list[4]
## $BCLLATLAS_46
## `geom_smooth()` using formula 'y ~ x'

11 Mitochondrial Counts Ratio

# Visualize the distribution of mitochondrial gene expression detected per cell
mt_plot <- function(x){
  obj <- subproject_seurat_obj_list[[x]]
    metadata_subset <- obj@meta.data    
    p6 <- metadata_subset %>% 
    ggplot(aes(color=sample, x=percent.mt, fill=sample)) + 
    geom_density(alpha = 15) + 
    theme_classic() +
    geom_vline(xintercept = max_mt[[x]])
}
vec <- c(1,2,3,4)
plot_list <- lapply(vec, mt_plot)

plot_list[1]  
## [[1]]

plot_list[2]
## [[1]]

plot_list[3]
## [[1]]

plot_list[4]
## [[1]]

12 Overall Complexity

# Visualize the overall complexity of the gene expression by visualizing the genes detected per UMI

complexity_plot <- function(obj){
    metadata_subset <- obj@meta.data    
    p7 <- metadata_subset %>% 
    ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
    geom_density(alpha = 0.2) +
    theme_classic() +
    geom_vline(xintercept = 0.8)
}
plot_list <- lapply(subproject_seurat_obj_list, complexity_plot)
plot_list[1]  
## $BCLLATLAS_33

plot_list[2]
## $BCLLATLAS_38

plot_list[3]
## $BCLLATLAS_40

plot_list[4]
## $BCLLATLAS_46

13 Violin Plot

VlnPlot(bcll.combined, features = "nCount_ADT",
  log = TRUE, pt.size = 0) + NoLegend()

VlnPlot(bcll.combined, features = "nCount_ADT",log = FALSE, pt.size = 0, y.max = 5000) + NoLegend()
## Warning: Removed 554 rows containing non-finite values (stat_ydensity).

VlnPlot(bcll.combined, features =  "nCount_RNA", log = TRUE, pt.size = 0) + NoLegend()

VlnPlot(bcll.combined, features = "percent.mt",
  log = TRUE, pt.size = 0) + NoLegend()

VlnPlot(bcll.combined, features = "scrublet_doublet_scores",
  log = TRUE, pt.size = 0) + NoLegend()

14 Filtering based on the QC results

# Filter out low quality reads using selected thresholds - these will change with experiment

subset_seurat_obj <- function(x) {
  subset_seurat <- subset(subproject_seurat_obj_list[[x]], subset = nFeature_RNA > min_feats[[x]] & nFeature_RNA < max_feats[[x]] & percent.mt < max_mt[[x]] & nCount_RNA > min_counts[[x]] & nCount_RNA < max_counts[[x]])
  return(subset_seurat)
}

subset_seurat_obj
## function(x) {
##   subset_seurat <- subset(subproject_seurat_obj_list[[x]], subset = nFeature_RNA > min_feats[[x]] & nFeature_RNA < max_feats[[x]] & percent.mt < max_mt[[x]] & nCount_RNA > min_counts[[x]] & nCount_RNA < max_counts[[x]])
##   return(subset_seurat)
## }
vec=c(1,2,3,4)
filtered_subproject_seurat_obj_list <- lapply(vec, subset_seurat_obj)

filtered_bcll.combined <- Reduce(merge, filtered_subproject_seurat_obj_list)
# Output a logical vector for every gene on whether the more than zero counts per cell
# Extract counts
counts <- GetAssayData(object = filtered_bcll.combined, slot = "counts")

# Output a logical vector for every gene on whether the more than zero counts per cell
nonzero <- counts > 0

# Sums all TRUE values and returns TRUE if more than max_genes TRUE values per gene
keep_genes <- Matrix::rowSums(nonzero) >= max_genes

# Only keeping those genes expressed in more than max_genes cells
filtered_counts <- counts[keep_genes, ]

# Reassign to filtered Seurat object
filtered_bcll.combined <- CreateSeuratObject(filtered_counts, meta.data = filtered_bcll.combined@meta.data)
cells.use = rownames(filtered_bcll.combined@meta.data)
filtered_bcll.combined[["ADT"]] <- subset(bcll.combined[["ADT"]], cells = cells.use)

15 Create clean metadata

# Save filtered subset to new metadata
metadata_clean <- filtered_bcll.combined@meta.data

Re_QC_summary <- filtered_bcll.combined@meta.data %>% group_by(orig.ident) %>% summarise(Total_cells = n(), Mean_RNA_count= mean(nCount_RNA), Mean_feature_count = mean(nFeature_RNA), Mean_mito_percent = mean(percent.mt), Mean_log10GenesPerUMI = mean(log10GenesPerUMI), Number_of_scrublet_doublet = sum(scrublet_predicted_doublet == "True"), Number_of_genotype_doublet = sum(genotype_based_doublet_flag == "T") , Number_of_genotype_unassigned = sum(genotype_based_unassigned_flag == "T"), Number_of_TCR = sum(tcr_flag == "T", na.rm = T) , Number_of_BCR = sum(bcr_flag == "T", na.rm = T) , Number_of_mait_cells = sum(mait_evidence %in% c("TRB:gene","TRA:gene+junction;TRB:gene","TRA:gene","TRA:gene;TRB:gene","TRA:junction;TRB:gene","TRA:junction","TRA:gene+junction"), na.rm = T), Number_of_inkt_cells =sum(inkt_evidence %in% c("TRB:gene","TRA:gene;TRB:gene","TRA:gene"), na.rm = T), .groups = 'drop')
library(knitr)
library(kableExtra)
kable(Re_QC_summary) %>%
  kable_styling("striped", full_width = T)
orig.ident Total_cells Mean_RNA_count Mean_feature_count Mean_mito_percent Mean_log10GenesPerUMI Number_of_scrublet_doublet Number_of_genotype_doublet Number_of_genotype_unassigned Number_of_TCR Number_of_BCR Number_of_mait_cells Number_of_inkt_cells
BCLLATLAS_33_ifZOgenn_TpMNTvBa 2833 2331.804 979.4328 2.977751 0.8957582 3 0 0 317 1133 76 3
BCLLATLAS_33_mLuLpVxi_v0fLyotc 2506 2873.893 1140.8432 2.732257 0.8918550 1 0 0 292 1315 68 0
BCLLATLAS_38_B20O1bh7_VmM99YZJ 2151 1007.170 503.5807 4.717720 0.9080851 3 123 116 205 544 49 1
BCLLATLAS_38_rdFRFhrU_ZdYeOZlf 2477 1259.698 594.3181 4.136973 0.9024274 2 163 79 232 782 56 2
BCLLATLAS_38_WToIzInl_LudU7hVX 1797 1011.050 494.9399 4.320309 0.9050298 1 85 48 313 451 58 2
BCLLATLAS_40_CfdzDgHe_IMOTbrIP 2679 1421.684 656.7962 2.639688 0.9027553 7 0 0 278 852 52 3
BCLLATLAS_40_LxBTpkPO_8TcNpBg4 2904 1401.250 681.2307 2.423353 0.9094013 1 0 0 285 1031 66 1
BCLLATLAS_40_pseMjZsU_qgZNOhOQ 2897 1366.661 650.6966 2.520651 0.9058407 8 0 0 291 1015 66 6
BCLLATLAS_40_ujxNn2kq_lG2VLlYd 2961 1244.923 618.9976 2.604378 0.9114260 1 0 0 282 963 68 2
BCLLATLAS_40_uqJAc4r9_BYScOzxA 2992 1591.382 741.9171 2.340719 0.9046704 3 0 0 291 980 61 3
BCLLATLAS_46_BZEECBEG_GXkc6Q1y 3941 1510.374 612.3644 3.693103 0.8822999 224 0 0 1221 874 288 12
BCLLATLAS_46_HjqdPU0E_aGDmEY5F 2969 1289.834 587.0882 3.489705 0.8962578 5 0 0 707 425 155 5
BCLLATLAS_46_KETfaLdx_Ub1mtE13 4236 1510.265 645.6591 3.506280 0.8900121 176 0 0 1592 1326 375 9
BCLLATLAS_46_SOJZt9kY_qpnv20QN 3312 1579.972 688.8919 3.409893 0.8933073 4 0 0 797 631 205 3
BCLLATLAS_46_XV1SLOR2_HRF5D9A3 3331 1661.469 738.5749 3.173167 0.8967846 9 0 0 983 915 229 7

15.1 UMI counts (transcripts) per cell

# Visualize the number Feature per cell
metadata_clean %>% 
    ggplot(aes(color=sample, x=nFeature_RNA, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    theme_classic() +
    ylab("Cell density") +
    geom_vline(xintercept = 2500)

15.2 Gene detected per cell

# Visualize the number Gene per cell
metadata_clean %>% 
    ggplot(aes(color=sample, x=nCount_RNA, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    theme_classic() +
    ylab("Cell density") +
    geom_vline(xintercept = 200)

15.3 Mitochondrial Counts Ratio

# Visualize the distribution of mitochondrial gene expression detected per cell
metadata_clean %>% 
    ggplot(aes(color=sample, x=percent.mt, fill=sample)) + 
    geom_density(alpha = 15) + 
    theme_classic() +
    geom_vline(xintercept = 20)

15.4 Genotype and scrublet

Cross table to check the doublet called by scrublet and by the genotype package

table(metadata_clean$donor_id, metadata_clean$scrublet_predicted_doublet)
##                               
##                                False  True
##   B20O1bh7_VmM99YZJ_donor0       933     0
##   B20O1bh7_VmM99YZJ_donor1       238     1
##   B20O1bh7_VmM99YZJ_donor2       427     0
##   B20O1bh7_VmM99YZJ_donor3       313     0
##   B20O1bh7_VmM99YZJ_doublet      121     2
##   B20O1bh7_VmM99YZJ_unassigned   116     0
##   BCLL-15-T                    17371   418
##   BCLL-8-T                      5335     4
##   BCLL-9-T                     14413    20
##   rdFRFhrU_ZdYeOZlf_donor0       331     0
##   rdFRFhrU_ZdYeOZlf_donor1       314     0
##   rdFRFhrU_ZdYeOZlf_donor2      1049     0
##   rdFRFhrU_ZdYeOZlf_donor3       541     0
##   rdFRFhrU_ZdYeOZlf_doublet      161     2
##   rdFRFhrU_ZdYeOZlf_unassigned    79     0
##   WToIzInl_LudU7hVX_donor0       399     0
##   WToIzInl_LudU7hVX_donor1       318     0
##   WToIzInl_LudU7hVX_donor2       568     0
##   WToIzInl_LudU7hVX_donor3       379     0
##   WToIzInl_LudU7hVX_doublet       84     1
##   WToIzInl_LudU7hVX_unassigned    48     0

16 Summarise number of cells

sprintf("Total Cells : %i", length(rownames(metadata)))
## [1] "Total Cells : 46345"
sprintf("Total Cells post QC: %i", length(rownames(metadata_clean)))
## [1] "Total Cells post QC: 43986"
sprintf("T Cells (containing TCR): %i", length(rownames(subset(metadata_clean, metadata_clean$tcr_flag == "T"))))
## [1] "T Cells (containing TCR): 8086"
sprintf("B Cells (containing BCR) : %i", length(rownames(subset(metadata_clean, metadata_clean$bcr_flag == "T"))))
## [1] "B Cells (containing BCR) : 13237"
sprintf("Scrublet Doublet : %i", length(rownames(subset(metadata_clean, metadata_clean$scrublet_predicted_doublet == "True"))))
## [1] "Scrublet Doublet : 448"
sprintf("Genotype Doublet : %i", length(rownames(subset(metadata_clean, metadata_clean$genotype_based_doublet_flag == "T"))))
## [1] "Genotype Doublet : 371"
sprintf("Genotype Unassigned : %i", length(rownames(subset(metadata_clean, metadata_clean$genotype_based_unassigned_flag == "T"))))
## [1] "Genotype Unassigned : 243"

16.1 Save the seurat object

saveRDS(filtered_bcll.combined, file = rds_obj)

17 Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.3.4   gridExtra_2.3      forcats_0.5.1      stringr_1.4.0     
##  [5] purrr_0.3.4        readr_2.1.2        tidyr_1.2.0        tibble_3.1.6      
##  [9] tidyverse_1.3.1    ggplot2_3.3.5      dplyr_1.0.8        SeuratObject_4.0.4
## [13] Seurat_4.1.0       knitr_1.37        
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1          backports_1.4.1       systemfonts_1.0.4    
##   [4] plyr_1.8.6            igraph_1.2.11         lazyeval_0.2.2       
##   [7] splines_4.0.3         listenv_0.8.0         scattermore_0.8      
##  [10] digest_0.6.29         htmltools_0.5.2       fansi_1.0.2          
##  [13] magrittr_2.0.2        tensor_1.5            cluster_2.1.2        
##  [16] ROCR_1.0-11           tzdb_0.2.0            globals_0.14.0       
##  [19] modelr_0.1.8          matrixStats_0.61.0    vroom_1.5.7          
##  [22] svglite_2.1.0         spatstat.sparse_2.1-0 colorspace_2.0-3     
##  [25] rvest_1.0.2           ggrepel_0.9.1         haven_2.4.3          
##  [28] xfun_0.30             crayon_1.5.0          jsonlite_1.8.0       
##  [31] spatstat.data_2.1-2   survival_3.3-1        zoo_1.8-9            
##  [34] glue_1.6.2            polyclip_1.10-0       gtable_0.3.0         
##  [37] webshot_0.5.2         leiden_0.3.9          future.apply_1.8.1   
##  [40] abind_1.4-5           scales_1.1.1          DBI_1.1.2            
##  [43] spatstat.random_2.1-0 miniUI_0.1.1.1        Rcpp_1.0.8.3         
##  [46] viridisLite_0.4.0     xtable_1.8-4          reticulate_1.24      
##  [49] spatstat.core_2.4-0   bit_4.0.4             htmlwidgets_1.5.4    
##  [52] httr_1.4.2            RColorBrewer_1.1-2    ellipsis_0.3.2       
##  [55] ica_1.0-2             farver_2.1.0          pkgconfig_2.0.3      
##  [58] sass_0.4.0            uwot_0.1.11           dbplyr_2.1.1         
##  [61] deldir_1.0-6          utf8_1.2.2            labeling_0.4.2       
##  [64] tidyselect_1.1.2      rlang_1.0.2           reshape2_1.4.4       
##  [67] later_1.3.0           munsell_0.5.0         cellranger_1.1.0     
##  [70] tools_4.0.3           cli_3.2.0             generics_0.1.2       
##  [73] broom_0.7.12          ggridges_0.5.3        evaluate_0.15        
##  [76] fastmap_1.1.0         yaml_2.3.5            goftest_1.2-3        
##  [79] bit64_4.0.5           fs_1.5.2              fitdistrplus_1.1-8   
##  [82] RANN_2.6.1            pbapply_1.5-0         future_1.24.0        
##  [85] nlme_3.1-155          mime_0.12             xml2_1.3.3           
##  [88] compiler_4.0.3        rstudioapi_0.13       plotly_4.10.0        
##  [91] png_0.1-7             spatstat.utils_2.3-0  reprex_2.0.1         
##  [94] bslib_0.3.1           stringi_1.7.6         highr_0.9            
##  [97] lattice_0.20-45       Matrix_1.4-0          vctrs_0.3.8          
## [100] pillar_1.7.0          lifecycle_1.0.1       spatstat.geom_2.3-2  
## [103] lmtest_0.9-39         jquerylib_0.1.4       RcppAnnoy_0.0.19     
## [106] data.table_1.14.2     cowplot_1.1.1         irlba_2.3.5          
## [109] httpuv_1.6.5          patchwork_1.1.1       R6_2.5.1             
## [112] promises_1.2.0.1      KernSmooth_2.23-20    parallelly_1.30.0    
## [115] codetools_0.2-18      MASS_7.3-55           assertthat_0.2.1     
## [118] withr_2.5.0           sctransform_0.3.3     mgcv_1.8-39          
## [121] parallel_4.0.3        hms_1.1.1             grid_4.0.3           
## [124] rpart_4.1.16          rmarkdown_2.13        Rtsne_0.15           
## [127] shiny_1.7.1           lubridate_1.8.0